---
title: "WhiteVote"
output: html_document
---

# Load packages and import data

```{r}

remove.packages(c("choroplethr", "htmlTable"))
install.packages('htmlTable', dependencies = TRUE)
install.packages('choroplethr', dependencies = TRUE)


# load the packages needed
p_needed <- c("reshape2","readxl","countrycode","DataCombine","tempdisagg","zoo","ggplot2","ggfortify","stringr","Hmisc","dplyr","Amelia","stargazer","haven","plm","lmtest","MASS","knitr","stats4", "acs", "WDI", "ggmap", "RgoogleMaps", "tigris", "rvest", "tidycensus", "choroplethr", "choroplethrMaps", "maps", "mapproj")
packages <- rownames(installed.packages())
p_to_install <- p_needed[!(p_needed %in% packages)]
if (length(p_to_install) > 0) {
  install.packages(p_to_install)
}
sapply(p_needed, require, character.only = TRUE)
options(scipen=999)


# load the libraries needed
library(lmtest)
library(sandwich)
library(xtable)
library(lme4)
library(texreg)
library(haven)
library(dplyr)
library(ggplot2)
library(readxl)
library(reshape2)
library(choroplethr)
library(choroplethrMaps)
library(gridExtra)
library(knitr)
library(mapproj)

# import the datasets:
DataAll <- read_dta("DataAll.dta")

all <- read_dta("PresidentialElection.dta")

swing_states.orig <- read_excel(path="Subsample_SwingStates.xls") 

```


### FIGURE 1 ###

```{r}

#Calculating percentages for different groups and ACS answers
all$germantotal <- all$germantotal/all$totalpopulation*100
all$germansingle <- all$germansingle/all$totalpopulation*100
all$irishtotal <- all$irishtotal/all$totalpopulation*100
all$irishsingle <- all$irishsingle/all$totalpopulation*100
all$englishtotal <- all$englishtotal/all$totalpopulation*100
all$englishsingle <- all$englishsingle/all$totalpopulation*100
all$italiantotal <- all$italiantotal/all$totalpopulation*100
all$italiansingle <- all$italiansingle/all$totalpopulation*100
all$scandiviantotal <- all$scandiviantotal/all$totalpopulation*100
all$scandiviansingle <- all$scandiviansingle/all$totalpopulation*100
all$americantotal <- all$americantotal/all$totalpopulation*100
all$americansingle <- all$americansingle/all$totalpopulation*100
all$lutheran <- all$lutheran/all$totalpopulation*100
all$catholic <- all$catholic/all$totalpopulation*100
head(all)

  

### National Vote Share Republican
rep.national.agg <- all %>%
  group_by(yearvote) %>%
  dplyr::summarise(sum.rep.votes=sum(repvotes, na.rm=T), sum.total.votes=sum(totalvotes, na.rm=T)) 
rep.national.agg$prob.rep.vots <- (rep.national.agg$sum.rep.votes/rep.national.agg$sum.total.votes)
rep.national.agg$prob.rep.vots <- c(0.588,0.534,0.374,0.407,0.479,0.507,0.457,0.472,0.461)
rep.national.agg$group <- "National"  ### not all counties from  2016 Wahl in the 2011 census - leads to NAs
rep.national.agg

### American 
rep.American.agg <- all %>%
  filter(americantotal>=20) %>%
  group_by(yearvote) %>%
  dplyr::summarise(sum.rep.votes=sum(repvotes, na.rm=T), sum.total.votes=sum(totalvotes, na.rm=T))  #new column: votes per year
rep.American.agg$prob.rep.vots <- (rep.American.agg$sum.rep.votes/rep.American.agg$sum.total.votes)# percent
rep.American.agg$group <- "Unhyphenated American"  
rep.American.agg

### German                                                                                                                                                                                                                                                                                                                                                                                            sum(American[which(American$year.vote==2012),]$rep.votes,na.rm=TRUE)/sum(American[which(American$year.vote==2012),]$total.votes,na.rm=TRUE)*100,
rep.German.agg <- all %>% 
  filter(germantotal>=20) %>%
  group_by(yearvote) %>% 
  dplyr::summarise(sum.rep.votes=sum(repvotes, na.rm=T), sum.total.votes=sum(totalvotes, na.rm=T)) # new column: votes per year
rep.German.agg$prob.rep.vots <- (rep.German.agg$sum.rep.votes/rep.German.agg$sum.total.votes) # percent
rep.German.agg$group <- "German American"
rep.German.agg

df.rep.plot <- rbind(rep.German.agg[,c(1,4,5)], rep.American.agg[,c(1,4,5)], rep.national.agg[,c(1,4,5)])
df.rep.plot$order <- 1
df.rep.plot[which(df.rep.plot$group=="German American"),]$order <- 2
df.rep.plot[which(df.rep.plot$group=="Unhyphenated American"),]$order <- 3
df.rep.plot$group  <- with(df.rep.plot, reorder(group, order))


ggplot(data=df.rep.plot, mapping=aes(x=yearvote, y=prob.rep.vots, color=group)) +
  geom_line() +
  scale_x_continuous(breaks = c(1984, 1988, 1992, 1996, 2000, 2004, 2008, 2012, 2016)) +
  scale_y_continuous(labels = scales::percent) +
  #theme_bw() + # white  background
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), # no grid no frame
        panel.background = element_blank(), axis.line = element_line(colour = "black")) +
  labs(title="German and Unhyphenated Americans Compared", subtitle=paste0(" U.S. counties 2016 (excl. Alaska): 3114, counties with >=20% German American: 1295, counties with >=20% Unhyphenated: 355"), 
       y="Republican votes in %", x="election year", color="Legend") +
  theme(plot.subtitle=element_text(size=7, hjust=0.5, face="italic", color="black"),
        legend.position="bottom", legend.title = element_blank(), plot.title = element_text(hjust = 0.5))


### National Vote Share Democrat
dem.national.agg <- all %>%
  group_by(yearvote) %>%
  dplyr::summarise(sum.dem.votes=sum(demvotes, na.rm=T), sum.total.votes=sum(totalvotes, na.rm=T)) 
dem.national.agg$prob.dem.vots <- (dem.national.agg$sum.dem.votes/dem.national.agg$sum.total.votes)
dem.national.agg$prob.dem.vots <- c(0.406,0.456,0.430,0.492,0.484,0.483,0.529,0.511,0.482)
dem.national.agg$group <- "National" ###not all counties from  2016 election in the 2011 census - leads to NAs
dem.national.agg

### American 
dem.American.agg <- all %>%
  filter(americantotal>=20) %>%
  group_by(yearvote) %>%
  dplyr::summarise(sum.dem.votes=sum(demvotes, na.rm=T), sum.total.votes=sum(totalvotes, na.rm=T)) # new column: votes per year
dem.American.agg$prob.dem.vots <- (dem.American.agg$sum.dem.votes/dem.American.agg$sum.total.votes)# percent
dem.American.agg$group <- "Unhyphenated American"  
dem.American.agg

### German                                                                                                                                                                                                                                                                                                                                                                                            sum(American[which(American$year.vote==2012),]$dem.votes,na.rm=TRUE)/sum(American[which(American$year.vote==2012),]$total.votes,na.rm=TRUE)*100,
dem.German.agg <- all %>%
  filter(germantotal>=20) %>%
  group_by(yearvote) %>%
  dplyr::summarise(sum.dem.votes=sum(demvotes, na.rm=T), sum.total.votes=sum(totalvotes, na.rm=T)) # new column: votes per year
dem.German.agg$prob.dem.vots <- (dem.German.agg$sum.dem.votes/dem.German.agg$sum.total.votes) # percent
dem.German.agg$group <- "German American"
dem.German.agg

df.dem.plot <- rbind(dem.German.agg[,c(1,4,5)], dem.American.agg[,c(1,4,5)], dem.national.agg[,c(1,4,5)])
df.dem.plot$order <- 1
df.dem.plot[which(df.dem.plot$group=="German American"),]$order <- 2
df.dem.plot[which(df.dem.plot$group=="Unhyphenated American"),]$order <- 3
df.dem.plot$group  <- with(df.dem.plot, reorder(group, order))

df.dem.plot
ggplot(data=df.dem.plot, mapping=aes(x=yearvote, y=prob.dem.vots, color=group)) +
  geom_line() +
  scale_x_continuous(breaks = c(1984, 1988, 1992, 1996, 2000, 2004, 2008, 2012, 2016)) +
  scale_y_continuous(labels = scales::percent) +
  #theme_bw() + # macht den Hintergrund weiß
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),  # no grid no frame
        panel.background = element_blank(), axis.line = element_line(colour = "black")) +
  labs(title="German and Unhyphenated Americans Compared", subtitle=paste0("All U.S. counties 2016: 3152, counties with >=20% German American: 1295, counties with >=20% Unhyphenated: 355"), 
       y="Democratic votes in %", x="election year", color="Legend") +
  theme(plot.subtitle=element_text(size=7, hjust=0.5, face="italic", color="black"),
        legend.position="bottom", legend.title = element_blank(), plot.title = element_text(hjust = 0.5))


####### Combining the Plots #######

df.dem.plot$party <- "Democratic votes"
names(df.dem.plot) <- c("year.vote","prob.vots", "group", "order", "party")

df.rep.plot$party <- "Republican votes"
names(df.rep.plot) <- c("year.vote","prob.vots", "group", "order", "party")

df.plot <- rbind(df.dem.plot, df.rep.plot)
### Alternative
df.plot$party <- factor(df.plot$party, levels = c("Republican votes", "Democratic votes"))
### Alternative

# generate FIGURE 1
ggplot(data=df.plot, mapping=aes(x=year.vote, y=prob.vots, color=group)) +
  geom_line() +
  scale_x_continuous(breaks = c(1984, 1988, 1992, 1996, 2000, 2004, 2008, 2012, 2016)) +
  scale_y_continuous(labels = scales::percent) +
  #theme_bw() + # white background
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),  # no grid no frame
        panel.background = element_blank(), axis.line = element_line(colour = "black")) +
  labs(title="German and Unhyphenated Americans Compared", subtitle=paste0(" U.S. counties 2016 (excl. Alaska): 3114, counties with >=20% German American: 1295, counties with >=20% Unhyphenated: 355"), 
       y="votes in %", x="election year", color="Legend") +
  theme(plot.subtitle=element_text(size=7, hjust=0.5, face="italic", color="black"),
        legend.position="bottom", legend.title = element_blank(), plot.title = element_text(hjust = 0.5)) +
  facet_wrap(~ party) 

```

### FIGURE 2 ###

```{r}
### TABLE 3 MODEL 6 ###

### OBSERVED VALUE APPROACH ###


# Regression model:

# TABLE 3 Model 6
model6 <- glm(swing_state_dummy2016 ~ German_share_single2016 + American_share_single2016 + English_share_single2016 + Irish_share_single2016 + Italian_share_single2016 + NoCollegeDegree + IncomeDiffToNatMedianIncome + dummy_american_south2016, family = "binomial", data = DataAll)
summary(model6)


# Define variables
y <- DataAll$swing_state_dummy2016
X <- cbind(1, DataAll$German_share_single2016, DataAll$American_share_single2016, DataAll$English_share_single2016, DataAll$Irish_share_single2016, DataAll$Italian_share_single2016, DataAll$NoCollegeDegree, DataAll$IncomeDiffToNatMedianIncome, DataAll$dummy_american_south2016)
X <- na.omit(X)

# Step 1
# Get coefficients and the variance-covariance matrix for the simulation
mu <- model6$coefficients[1:9]
varcov <- vcovHC(model6, "HC1")
nsim <- 1000

# Simulate estimation uncertainty
set.seed(54321)

library(MASS)
S <- mvrnorm(nsim, mu, varcov)

# Step 2 and 3
# Observed Value Approach
ovaP <- apply(S, 1, function(s) mean(1 / (1 + exp(- X %*% s))))

hist(ovaP, breaks=20,
     main = "Histogram of Observed Value Predicted Probabilities",
     xlab = "Predicted Probability", las = 1,
     col = "lightgrey", border = "white")


##### range of values #####

Scenario <- seq(min(DataAll$German_share_single2016,na.rm=T), max(DataAll$German_share_single2016, na.rm=T), 0.5)

cases <- array(NA, c(dim(X), length(Scenario)))

cases[, , ] <- X

for (i in 1:length(Scenario)){
  cases[, 2, i] <- Scenario[i]  
}

val <- matrix(NA, nrow = nsim, ncol = length(Scenario))

for(i in 1:length(Scenario)){
  val[, i] <- apply(S, 1, function(s)  mean(1 / (1 + exp(-cases[, , i] %*% s))))
}  


df <- data.frame(t(apply(val, 2, quantile, c(0.975, 0.025))))
df$mean <- apply(val, 2, mean)
df$Quantile <- seq(min(DataAll$German_share_single2016,na.rm=T), max(DataAll$German_share_single2016, na.rm=T), 0.5)
names(df)[1:2] <- c("upper","lower")

# FIGURE 2
ggplot(df,aes(x = Quantile, y = mean, ymin = lower, ymax = upper)) + 
  ylab("Probability County lies in Dem to Rep Swinging State") + 
  scale_x_continuous(limits = c(0, 48)) + # ,breaks = seq(0, 48, 5)
  xlab("German ancestry share") + 
  ggtitle("Observed value approach for a range of German ancestry share values") + 
  geom_ribbon(aes(ymin=lower, ymax=upper), fill = "black", colour = "grey", linetype = "dotted", alpha=.2) + 
  theme_bw() + 
  geom_line(aes(y = mean), colour = "black")  + 
  scale_y_continuous(limits = c(0, 0.8), breaks = seq(0, 0.8, 0.1)) +
  theme(panel.grid = element_blank()) +
  geom_rug(data = DataAll, aes(x = German_share_single2016), inherit.aes = F, size=0.3, sides="b")

```

### FIGURE 3 ###

```{r}

### Background idea of map: https://www.r-bloggers.com/mapping-election-results-with-r-and-choroplethr/


fl <- read.delim("http://fldoselectionfiles.elections.myflorida.com/enightfilespublic/20160315_ElecResultsFL.txt", strip.white = T)

library(lmtest)
library(sandwich)
library(xtable)
library(lme4)
library(texreg)
library(haven)
library(dplyr)
library(ggplot2)
library(readxl)
library(reshape2)
library(choroplethr)
library(choroplethrMaps)
library(gridExtra)
library(knitr)
library(mapproj)

# Filter leaving only one party, and select desired columns
dem <- filter(fl, PartyCode == "DEM") %>% select(CountyName, CanNameLast, CanVotes) # if this line does not run without an error please do re-start R and run this last chunk again

# Cast dem dataframe from long to wide using dcast 
dem_cast <- dcast(dem, CountyName ~ CanNameLast, sum)  # Now we can see each candidate's votes per county
colnames(dem_cast)[3] <- "OMalley" # Remove apostrophe from O'Malley

# Change CountyName column from Factor to lowercase Character  
dem_cast$CountyName <- tolower(as.character(dem_cast$CountyName))  

# Create columns for total votes in each county
dem_cast <- mutate(dem_cast, total = Clinton + OMalley + Sanders)

# Create columns for percentage variables
dem_cast <- mutate(dem_cast, hc = (Clinton/total)*100, bs = (Sanders/total)*100, mo = (OMalley/total)*100)
dem_cast[,6:8] <- round(dem_cast[,6:8], digits = 1)  # Round new variables to 1 decimal place

# Read county.regions dataframe supplied by choroplethrMaps package
data("county.regions")

# Filter leaving only florida counties, and select only the 2 needed columns
fl.regions <- filter(county.regions, state.name == "florida") %>% select(region, "CountyName" = county.name)

#cl all of USA
names(county.regions)[3] <- "CountyName"

# Join regions column from fl.regions dataframe to election results dataframe
df <- left_join(dem_cast, fl.regions)

bs.counties <- filter(df, Sanders > Clinton & Sanders > OMalley)
kable(bs.counties, caption = "Counties won by Sanders")

hc.counties <- filter(df, Clinton > Sanders & Clinton > OMalley)
kable(hc.counties, caption = "Counties won by Clinton")

# For each candidate, map the percent of each counties' total vote using choroplethr package 
df$value = df$bs  # Set the desired 'value' column for choroplethr
choro_bs = county_choropleth(df, state_zoom="florida", legend = "%", num_colors=1) + 
  ggtitle("Bernie Sanders") +
  coord_map()  # Adds a Mercator projection
choro_bs

df$value = df$hc  # Set the desired 'value' column for choroplethr
choro_hc = county_choropleth(df, state_zoom="florida", legend = "%", num_colors=1) + 
  ggtitle("Hillary Clinton") +
  coord_map()
choro_hc

df$value = df$mo  # Set the desired 'value' column for choroplethr
choro_mo = county_choropleth(df, state_zoom="florida", legend = "%", num_colors=1) + 
  ggtitle("Martin O'Malley") +
  coord_map()
choro_mo

# Plot all three maps in a grid
grid.arrange(choro_hc, choro_bs, choro_mo, ncol=3, top = "Florida Democratic Primary 2016n Percent of Total Votes by Countyn ")


# Function for highlighting a county
highlight_county = function(county_fips)
{
  library(choroplethrMaps)
  data(county.map, package="choroplethrMaps", envir=environment())
  df = county.map[county.map$region %in% county_fips, ]
  geom_polygon(data=df, aes(long, lat, group = group), color = "yellow", fill = NA, size = 0.5)
}
# Filter counties won by Sanders
bs.counties <- filter(df, Sanders > Clinton & Sanders > OMalley)

# Create list of counties won
bs.fips <- bs.counties[[9]]

# Map using the highlight_county() function after calling county_choropleth()
df$value = df$bs  # Set the desired 'value' column for choroplethr
choro_bs = county_choropleth(df, state_zoom="florida", legend = "%", num_colors=1) + 
  highlight_county(bs.fips) +  # Highlight counties won
  ggtitle("Bernie Sanders") +
  coord_map()  # Adds a Mercator projection
choro_bs

# Add a new column to show each county's winner
df$winner <- as.factor(ifelse(df$hc > df$bs, "Clinton", "Sanders"))

# Plot of winner by county
df$value = df$winner  # Set the desired 'value' column for choroplethr 
choro_winner = county_choropleth(df, state_zoom="florida", legend = "Winner", num_colors=2) +   
  ggtitle("Florida Presidential Primaryn 15 March 2016") +   
  coord_map() 
choro_winner

############################# Swing from the Ethnic Back-Stage: Swing County Map ############################# 

# Only the five Midwestern Swing states
swing_states <- filter(swing_states.orig, state %in% c("Pennsylvania", "Wisconsin", "Iowa", "Ohio", "Michigan")) %>% select(fips, state, county, GermanFirstHighest2016, var22016)
# Did the county swing?
swing_states$swing <- "yes"
# label change
names(swing_states)[1] <- "region"

# Only the five Midwestern Swing states
swing_all_2016 <- filter(all, yearvote == 2016 & state %in% c("Pennsylvania", "Wisconsin", "Iowa", "Ohio", "Michigan")) %>% select(fips,state, county, dempercent, reppercent)
# label change
names(swing_all_2016)[1] <- "region"

## merge the tables
df <- merge(x = swing_all_2016, y = swing_states, by = "region", all = TRUE)
df <- df[,c(1,2,3,4,5,8,9,10)]
# NAs cannot come from swing-county table so: "no"
df[which(is.na(df$swing)),]$swing <- "no"

# new column for plot, combination of dimensions:
## swing and non-swing plus within the swing states: ethnic composition
df$ethn_swing <- NA
# Column AW==1 and BO==3 -> German dominant
df[which(df$GermanFirstHighest2016==1 & df$var22016==3),]$ethn_swing <- "German dominant"
# Column AW==1 and BO!=3 -> German non-dominant
df[which(df$GermanFirstHighest2016==1 & df$var22016!=3),]$ethn_swing <- "German non-dominant"
# Column AW != 1 other dominant
df[which(df$GermanFirstHighest2016!=1),]$ethn_swing <- "other"
# copy "no" in "swing" into our column
df[which(df$swing=="no"),]$ethn_swing <- "no swing"
# Coding factors and order of factor level
df$ethn_swing <- as.factor(df$ethn_swing)
df$ethn_swing <- factor(df$ethn_swing, levels = c("German dominant", "German non-dominant", "other", "no swing"))

# control
table(df$ethn_swing)
# result: 36 German dominant, 32 German non-dominant, 11 other, 330 no swing counties

# Plot
df$value = df$ethn_swing  # Set the desired 'value' column for choroplethr 
swing = county_choropleth(df, state_zoom=c("wisconsin", "pennsylvania", "ohio", "iowa", "michigan") , legend = "ethn_swing", num_colors=4) +
  scale_fill_manual(values = c("#525252", "#969696", "#cccccc", "white"), name="County categories:") +
  theme(legend.position="bottom") +
  coord_map() 

# FIGURE 3
swing

```


